library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(stringr)
library(ggplot2)
library(fdrtool)
library(corrplot)
## corrplot 0.88 loaded

Sample sizes

meta_qc_align <- read_tsv(snakemake@input$meta_qc_align)
## 
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
##   cohort = col_character(),
##   ancestries = col_character(),
##   release = col_character(),
##   N_cases = col_double(),
##   N_controls = col_double(),
##   N_snps = col_double(),
##   N_snps_aligned = col_double(),
##   median_or = col_double(),
##   max_OR = col_double(),
##   mean_SE = col_double(),
##   max_SE = col_double()
## )
cohorts_mdd <- read_tsv(snakemake@input$cohorts_mdd)
## Warning: Missing column names filled in: 'X7' [7]
## 
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
##   Dataset = col_character(),
##   N_cases = col_double(),
##   N_controls = col_double(),
##   `LAMBDA-GC` = col_double(),
##   `N-SNPs` = col_double(),
##   N_eff_half = col_double(),
##   X7 = col_logical()
## )
cohorts_samples <- cohorts_mdd %>%
filter(Dataset != 'SUM') %>%
mutate(cohort=str_match(Dataset, "mdd_(.+)_eur")[,2])

meta_samples <- 
meta_qc_align %>%
filter(!cohort %in% c('MDD29', 'PGC')) %>%
bind_rows(cohorts_samples) %>%
group_by(cohort) %>%
summarize(Cases=sum(N_cases), Controls=sum(N_controls)) %>%
pivot_longer(Cases:Controls, names_to="MDD", values_to="N") %>%
arrange(desc(N))

cohort_order <-
meta_samples %>%
filter(MDD == 'Cases') %>%
arrange(desc(N)) %>%
pull(cohort)

ggplot(meta_samples, aes(x=0, y=0, color=MDD, size=N)) +
geom_point() +
facet_wrap(~factor(cohort, levels=cohort_order)) +
scale_size_area(max_size=33) +
theme_minimal() +
theme(axis.line=element_blank(),
  axis.text.x=element_blank(),
  axis.text.y=element_blank(),
  axis.ticks=element_blank(),
  axis.title.x=element_blank(),
  axis.title.y=element_blank())

Reference panel alignment

Marker names were aligned to the meta-analysis reference panel and effect sizes were checked. SNPs with MAF < 0.01 in the imputation reference panel and INFO < 0.1 in the cohort were removed.

meta_qc_align %>% print(n=Inf)
## # A tibble: 32 x 11
##    cohort  ancestries release N_cases N_controls N_snps N_snps_aligned median_or
##    <chr>   <chr>      <chr>     <dbl>      <dbl>  <dbl>          <dbl>     <dbl>
##  1 23andMe eas        v7_2       2727      90220 5.67e6        4589798     1    
##  2 23andMe eur        v7_2_2…  112892    1773938 7.50e6        6035449     1    
##  3 AGDS    eur        202012    12123      12684 7.62e6        7582482     1.00 
##  4 Airwave eur        0820       2100      15713 7.49e6        7472104     1.00 
##  5 ALSPAC  eur        120820…     472       3475 9.05e6        5616108     1.00 
##  6 BASIC   eur        202011     1003       1854 7.77e6        6718587     0.999
##  7 BioVU   eur        Cov_SA…    7757      24723 6.26e6        5816204     1.00 
##  8 BioVU   eur        NoCov_…    7757      24723 6.26e6        5816204     1.00 
##  9 DBDS    eur        FINAL2…   13347     145996 7.59e6        6623408     1.00 
## 10 deCODE  eur        DEPALL…   20000      28000 8.84e6        7322173     1    
## 11 ESTBB   eur        EstBB     35473      91301 2.69e7        7603277     1.00 
## 12 EXCEED  eur        202010      580       2071 8.08e6        5948847     0.999
## 13 FinnGen eur        R5_180…   23424     192220 1.64e7        7626331     1.00 
## 14 GenScot eur        1215a       997       6358 7.71e6        6371459     1.00 
## 15 GERA    eur        0915a_…    7162      38307 1.09e7        7855034     1    
## 16 HUNT    eur        gp_hos…   11658      42535 8.69e6        7794399     1.00 
## 17 iPSYCH  eur        2012_H…   19156      22708 8.81e6        7664074     1    
## 18 iPSYCH  eur        2015i_…   10002      15434 8.85e6        7683643     1    
## 19 lgic2   eur        202011      906       4717 7.76e6        6576356     0.999
## 20 MDD29   eur        0120a_…   16674      25452 9.75e6        7962860     1.00 
## 21 MDD49   eur        29w2_2…   28147      48033 7.71e6        7677489     0.998
## 22 MoBa    eur        harves…     603      10213 6.50e6        5414660     1.00 
## 23 MoBa    eur        harves…     367       6122 6.50e6        4821265     1.00 
## 24 MoBa    eur        rotter…     553       8860 6.50e6        5324929     1.00 
## 25 MVP     eur        4_0ICD…  151974     226640 6.08e6        5269999     1    
## 26 PBK     eur        2020       5607      16080 7.35e6        7311190     1    
## 27 PGC     eur        wray20…  135458     344901 1.36e7        7941764     1    
## 28 PREFECT eur        run1       1796       3290 8.96e6        7529319     0.999
## 29 SHARE   eur        godart…    1063       1921 1.36e7        6534886     1    
## 30 STAGE   eur        MDDdx_…     421       9134 7.43e6        5248073     0.999
## 31 tkda1   eur        run1        672        846 8.86e6        6135331     0.998
## 32 UKBB    eur        MD_glm…   41500     319630 9.10e6        7396841     1    
## # … with 3 more variables: max_OR <dbl>, mean_SE <dbl>, max_SE <dbl>

Median odds-ratios with standard error

ggplot(meta_qc_align %>%
       unite('cohort',
              cohort, ancestries, release, 
              sep="."),
        aes(x=reorder(cohort, (4*N_cases*N_controls)/(N_cases + N_controls)),
            y=median_or,
            ymax=exp(log(median_or)+mean_SE),
            ymin=exp(log(median_or)-mean_SE))) +
geom_linerange() +
geom_point() +
coord_flip(ylim=c(0.8, 1.25))

Mean standard error versus effective sample size:

ggplot(meta_qc_align %>%
       unite('cohort',
              cohort, ancestries, release, 
              sep=".") %>%
       mutate(Neff=(4*N_cases*N_controls)/(N_cases + N_controls)),
        aes(x=Neff, y=mean_SE, label=cohort)) +
geom_text(check_overlap=TRUE, size=2) +
scale_x_log10(breaks=c(1, 100, 1000, 5000, 10000, 50000, 100000, 500000))

Genetic correlation with clinical cohorts

LDSC genetic correlations were calculated with the PGC MDD29 clinical cohorts

meta_qc_ldsc <-
read_table2(snakemake@input$meta_qc_ldsc) %>%
mutate(se.mdd2=as.numeric(str_remove_all(se.mdd2, "[\\(\\)]")),
 se.mdd29=as.numeric(str_remove_all(se.mdd29, "[\\(\\)]")))
## 
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
##   cohort = col_character(),
##   release = col_character(),
##   rg.mdd2 = col_double(),
##   se.mdd2 = col_character(),
##   gencov.mdd2 = col_double(),
##   rg.mdd29 = col_double(),
##   se.mdd29 = col_character(),
##   gencov.mdd29 = col_double()
## )
## Warning: 3 parsing failures.
## row col  expected    actual                           file
##   4  -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
##   5  -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
##  20  -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
ggplot(meta_qc_ldsc, aes(x=reorder(paste(cohort, release), rg.mdd29), y=rg.mdd29, ymin=rg.mdd29-se.mdd29, ymax=rg.mdd29+se.mdd29)) +
geom_linerange() +
geom_point() +
coord_flip(ylim=c(0, 1.25))
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 8 rows containing missing values (geom_point).

Genetic covariance intercepts

Pairwise LDSC genetic covariance intercepts between all cohorts

meta_qc_ldsc_pairs <- read_tsv(snakemake@input$meta_qc_ldsc_pairs)
## 
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
##   cohort1 = col_character(),
##   subcohort1 = col_character(),
##   cohort2 = col_character(),
##   subcohort2 = col_character(),
##   ancestry1 = col_character(),
##   ancestry2 = col_character(),
##   rg = col_double(),
##   se = col_double(),
##   z = col_double(),
##   p = col_double(),
##   h2_obs = col_double(),
##   h2_obs_se = col_double(),
##   h2_int = col_double(),
##   h2_int_se = col_double(),
##   gcov_int = col_double(),
##   gcov_int_se = col_double()
## )
# calculate total sample sizes
meta_qc_samplesize <- meta_qc_align %>%
transmute(cohort, ancestries, release, N=N_cases+N_controls)

# sumstats appearing in MDD3 (exclude other
# sumstats like Wray 2018 that have had their 
# sumstats munged for other reasons)
meta_qc_ldsc_pairs_mdd3 <- meta_qc_ldsc_pairs  %>%
filter(!cohort1 %in% c('MDD29', 'PGC') & !cohort2 %in% c('MDD29', 'PGC')) %>%
left_join(meta_qc_samplesize %>% rename(cohortN1=N), by=c('cohort1'='cohort', 'subcohort1'='release', 'ancestry1'='ancestries')) %>%
left_join(meta_qc_samplesize %>% rename(cohortN2=N), by=c('cohort2'='cohort', 'subcohort2'='release', 'ancestry2'='ancestries'))

Histogram of intercepts

ggplot(meta_qc_ldsc_pairs_mdd3, aes(x=gcov_int)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 44 rows containing non-finite values (stat_bin).

Pairwise intercepts

Show the largest and smallest intercepts. Exclude Wray2018 sumstats that are also in the mix. Calculate distance from 0 in standard deviation units to get an idea of the magnitude of departure from expectation relative to the other intercepts.

meta_qc_ldsc_pairs_mdd3 %>%
select(cohort1, subcohort1, cohort2, subcohort2, gcov_int) %>%
mutate(SDs=abs(gcov_int)/sd(gcov_int, na.rm=T)) %>%
arrange(desc(gcov_int))
## # A tibble: 378 x 6
##    cohort1 subcohort1                     cohort2 subcohort2      gcov_int   SDs
##    <chr>   <chr>                          <chr>   <chr>              <dbl> <dbl>
##  1 23andMe v7_2_202012                    iPSYCH  2012_HRC          0.0219  3.46
##  2 MoBa    harvest12                      MoBa    rotterdam1        0.0213  3.36
##  3 MDD49   29w2_20w3_1504                 tkda1   run1              0.0212  3.35
##  4 HUNT    gp_hospital_metacarpa_20190625 MoBa    harvest12         0.0203  3.21
##  5 HUNT    gp_hospital_metacarpa_20190625 iPSYCH  2015i_HRC         0.0165  2.61
##  6 HUNT    gp_hospital_metacarpa_20190625 MVP     4_0ICDdep_2021…   0.0165  2.61
##  7 MVP     4_0ICDdep_202106               PBK     2020              0.015   2.37
##  8 AGDS    202012                         iPSYCH  2012_HRC          0.0147  2.32
##  9 HUNT    gp_hospital_metacarpa_20190625 iPSYCH  2012_HRC          0.0146  2.31
## 10 MDD49   29w2_20w3_1504                 PREFECT run1              0.0144  2.27
## # … with 368 more rows
meta_qc_ldsc_pairs_mdd3 %>%
select(cohort1, subcohort1, cohort2, subcohort2, gcov_int) %>%
mutate(SDs=abs(gcov_int)/sd(gcov_int, na.rm=T)) %>%
arrange(gcov_int)
## # A tibble: 378 x 6
##    cohort1 subcohort1          cohort2 subcohort2          gcov_int   SDs
##    <chr>   <chr>               <chr>   <chr>                  <dbl> <dbl>
##  1 GERA    0915a_mds5          MoBa    harvest12            -0.0172  2.72
##  2 MVP     4_0ICDdep_202106    lgic2   202011               -0.0127  2.01
##  3 23andMe v7_2_202012         BASIC   202011               -0.0123  1.94
##  4 23andMe v7_2_202012         Airwave 0820                 -0.012   1.90
##  5 SHARE   godartsshare_842021 iPSYCH  2012_HRC             -0.0116  1.83
##  6 EXCEED  202010              MoBa    harvest12            -0.0109  1.72
##  7 GERA    0915a_mds5          iPSYCH  2012_HRC             -0.0109  1.72
##  8 23andMe v7_2_202012         BioVU   NoCov_SAIGE_051821   -0.0106  1.67
##  9 MoBa    harvest12           SHARE   godartsshare_842021  -0.0106  1.67
## 10 23andMe v7_2_202012         ALSPAC  12082019             -0.0103  1.63
## # … with 368 more rows

Estimate amount of sample overlap as \(N_S = g_{\mathrm{cov}_\mathrm{int}}\sqrt{N_1N_2} / r_\mathrm{P}\), where \(r_\mathrm{P}\) is the phenotypic correlation between the two MDD phenotype (assume \(r_\mathrm{P} = 1\))

rP <- 1
meta_qc_ldsc_pairs_mdd3_ns <-
meta_qc_ldsc_pairs_mdd3 %>%
mutate(Ns_factor=sqrt(cohortN1*cohortN2)/rP) %>%
mutate(Ns=gcov_int*Ns_factor,
       Ns_l95=(gcov_int+qnorm(0.025)*gcov_int_se)*Ns_factor,
       Ns_u95=(gcov_int+qnorm(0.975)*gcov_int_se)*Ns_factor,
       chisq=gcov_int^2/gcov_int_se^2) %>%
filter(!is.na(chisq)) %>%
mutate(qval=fdrtool(pchisq(chisq, df=1, lower.tail=F), statistic='pvalue', plot=FALSE)$qval) %>%
select(cohort1, subcohort1, cohort2, subcohort2, cohortN1, cohortN2, chisq, qval, Ns, Ns_l95, Ns_u95) %>%
mutate(Ns_pct=100*Ns/(cohortN1+cohortN2))
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr

Sort (largest-to-smallest) by sample overlap

meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(Ns)) %>%
select(-chisq, -qval, -Ns_pct)
## # A tibble: 334 x 9
##    cohort1 subcohort1  cohort2 subcohort2 cohortN1 cohortN2    Ns  Ns_l95 Ns_u95
##    <chr>   <chr>       <chr>   <chr>         <dbl>    <dbl> <dbl>   <dbl>  <dbl>
##  1 23andMe v7_2_202012 iPSYCH  2012_HRC    1886830    41864 6155.  1.03e3 11278.
##  2 23andMe v7_2_202012 ESTBB   EstBB       1886830   126774 5478. -1.90e3 12859.
##  3 23andMe v7_2_202012 MVP     4_0ICDdep…  1886830   378614 4818. -1.27e4 22377.
##  4 23andMe v7_2_202012 UKBB    MD_glm_20…  1886830   361130 4788. -7.99e3 17569.
##  5 MVP     4_0ICDdep_… UKBB    MD_glm_20…   378614   361130 3106. -2.84e3  9049.
##  6 23andMe v7_2_202012 GERA    0915a_mds5  1886830    45469 2519. -1.90e3  6939.
##  7 HUNT    gp_hospita… MVP     4_0ICDdep…    54193   378614 2363.  3.70e2  4357.
##  8 FinnGen R5_18032020 MVP     4_0ICDdep…   215644   378614 2229. -1.86e3  6317.
##  9 MDD49   29w2_20w3_… UKBB    MD_glm_20…    76180   361130 2189.  4.38e1  4335.
## 10 ESTBB   EstBB       UKBB    MD_glm_20…   126774   361130 1990.  1.03e2  3877.
## # … with 324 more rows

Sort (largest-to-smallest) by percentage of overlap to combined sample size

meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(Ns_pct)) %>%
select(-cohortN1, -cohortN2, -chisq, -qval)
## # A tibble: 334 x 8
##    cohort1 subcohort1            cohort2 subcohort2      Ns Ns_l95 Ns_u95 Ns_pct
##    <chr>   <chr>                 <chr>   <chr>        <dbl>  <dbl>  <dbl>  <dbl>
##  1 MoBa    harvest12             MoBa    rotterdam1   215.  102.     328.  1.06 
##  2 HUNT    gp_hospital_metacarp… iPSYCH  2015i_HRC    613.  220.    1006.  0.769
##  3 HUNT    gp_hospital_metacarp… MoBa    harvest12    491.  183.     800.  0.756
##  4 HUNT    gp_hospital_metacarp… iPSYCH  2012_HRC     695.   88.6   1302.  0.724
##  5 AGDS    202012                iPSYCH  2012_HRC     474.   50.5    897.  0.711
##  6 PREFECT run1                  STAGE   MDDdx_saige   89.2   9.98   168.  0.609
##  7 STAGE   MDDdx_saige           lgic2   202011        92.4  16.2    168.  0.608
##  8 iPSYCH  2012_HRC              iPSYCH  2015i_HRC    401.    4.84   798.  0.596
##  9 GenScot 1215a                 PBK     2020         172.   52.9    291.  0.591
## 10 GERA    0915a_mds5            MDD49   29w2_20w3_1… 712.   43.1   1381.  0.585
## # … with 324 more rows

Sort by \(\chi^2\) (Wald) test statistics

meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(chisq)) %>%
select(-cohortN1, -cohortN2)
## # A tibble: 334 x 10
##    cohort1 subcohort1      cohort2 subcohort2  chisq   qval     Ns Ns_l95 Ns_u95
##    <chr>   <chr>           <chr>   <chr>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 MoBa    harvest12       MoBa    rotterdam1  14.0  0.0381  215.   102.    328.
##  2 MDD49   29w2_20w3_1504  tkda1   run1        13.4  0.0381  228.   106.    350.
##  3 HUNT    gp_hospital_me… MoBa    harvest12    9.75 0.150   491.   183.    800.
##  4 HUNT    gp_hospital_me… iPSYCH  2015i_HRC    9.34 0.166   613.   220.   1006.
##  5 GERA    0915a_mds5      UKBB    MD_glm_202…  8.07 0.226  1820.   564.   3075.
##  6 GenScot 1215a           PBK     2020         8.03 0.227   172.    52.9   291.
##  7 GERA    0915a_mds5      MoBa    harvest12    7.45 0.268  -381.  -655.   -108.
##  8 STAGE   MDDdx_saige     lgic2   202011       5.65 0.426    92.4   16.2   168.
##  9 23andMe v7_2_202012     iPSYCH  2012_HRC     5.55 0.434  6155.  1032.  11278.
## 10 HUNT    gp_hospital_me… MVP     4_0ICDdep_…  5.40 0.446  2363.   370.   4357.
## # … with 324 more rows, and 1 more variable: Ns_pct <dbl>

Test for heterogeniety in covariance intercepts. Calculate \(w\), the inverse variance of each \(g_\mathrm{covint}\) standard error, then calculate a weighted sum \(\hat{g_\mathrm{covint}}\)

meta_qc_ldsc_pairs_mdd3 %>%
filter(!is.na(gcov_int)) %>%
mutate(w=1/gcov_int_se^2) %>%
mutate(gcov_int_hat=sum(w*gcov_int)/sum(w)) %>%
summarise(Q=sum(w*(gcov_int-gcov_int_hat)^2), k=n()) %>%
mutate(I2=(Q-(k-1))/Q)
## # A tibble: 1 x 3
##       Q     k    I2
##   <dbl> <int> <dbl>
## 1  382.   334 0.128

There is thus some heterogeneity in genetic covariance intercepts. We also want to look at heterogeniety per-cohort. The LDSC intercepts were calculated for each unique pair of cohorts. Expand this table to have every ordering of each pair

# get unique cohort names from name and ancestry columns
meta_qc_ldsc_pairs_mdd3_named <-
meta_qc_ldsc_pairs_mdd3 %>%
mutate(cohort1=paste(cohort1, subcohort1, ancestry1, sep='.'),
       cohort2=paste(cohort2, subcohort2, ancestry2, sep='.')) %>%
mutate(pair_name=paste(cohort1, cohort2, sep=','))

# get all unique cohort names
cohorts <- unique(c(meta_qc_ldsc_pairs_mdd3_named$cohort1,
                    meta_qc_ldsc_pairs_mdd3_named$cohort2))

# ordered pairs of cohort we have data listings for
cohorts_pair_names <- unique(meta_qc_ldsc_pairs_mdd3_named$pair_name)

# create list of all pairs of cohorts
meta_qc_ldsc_pairs_mdd3_all <-
tibble(cohort1=cohorts, cohort2=cohorts) %>%
complete(cohort1, cohort2) %>%
filter(cohort1 != cohort2) %>%
mutate(pair_name12=paste(cohort1, cohort2, sep=','),
       pair_name21=paste(cohort2, cohort1, sep=',')) %>%
mutate(pair_name=case_when(pair_name12 %in% cohorts_pair_names ~ pair_name12,
                           pair_name21 %in% cohorts_pair_names ~ pair_name21,
                           TRUE ~ NA_character_)) %>%
select(cohort1, cohort2, pair_name) %>%
left_join(meta_qc_ldsc_pairs_mdd3_named %>% select(pair_name, gcov_int, gcov_int_se), by='pair_name')

Calculate \(Q\) and \(I^2\) for each cohort

meta_qc_ldsc_pairs_mdd3_all_het <-
meta_qc_ldsc_pairs_mdd3_all %>%
group_by(cohort1) %>%
filter(!is.na(gcov_int)) %>%
mutate(w=1/gcov_int_se^2) %>%
mutate(gcov_int_hat=sum(w*gcov_int)/sum(w)) %>%
summarise(Q=sum(w*(gcov_int-gcov_int_hat)^2), k=n()) %>%
mutate(I2=(Q-(k-1))/Q) %>%
arrange(desc(I2))
meta_qc_ldsc_pairs_mdd3_all_het %>%
as.data.frame()
##                                    cohort1        Q  k          I2
## 1                       MoBa.harvest12.eur 46.54321 24  0.50583550
## 2                      GERA.0915a_mds5.eur 42.34190 24  0.45680289
## 3                           tkda1.run1.eur 24.89089 16  0.39736981
## 4                      iPSYCH.2012_HRC.eur 38.03735 26  0.34275121
## 5                      MoBa.rotterdam1.eur 37.60505 26  0.33519566
## 6                         BASIC.202011.eur 19.58703 15  0.28524146
## 7  HUNT.gp_hospital_metacarpa_20190625.eur 32.85551 25  0.26952888
## 8                         PREFECT.run1.eur 30.54272 24  0.24695631
## 9                    STAGE.MDDdx_saige.eur 31.48149 27  0.17411785
## 10                MDD49.29w2_20w3_1504.eur 28.67780 25  0.16311567
## 11                       GenScot.1215a.eur 27.35163 25  0.12253850
## 12                     ALSPAC.12082019.eur 22.65496 22  0.07305087
## 13                        lgic2.202011.eur 25.82344 25  0.07061171
## 14                 23andMe.v7_2_202012.eur 27.22148 27  0.04487205
## 15                            PBK.2020.eur 23.39237 24  0.01677345
## 16                         AGDS.202012.eur 23.29556 25 -0.03023908
## 17            BioVU.NoCov_SAIGE_051821.eur 22.16441 24 -0.03769944
## 18                    iPSYCH.2015i_HRC.eur 22.98309 25 -0.04424587
## 19                  UKBB.MD_glm_202012.eur 21.66286 25 -0.10788710
## 20           SHARE.godartsshare_842021.eur 19.85713 23 -0.10791446
## 21                MVP.4_0ICDdep_202106.eur 22.36432 27 -0.16256598
## 22                       EXCEED.202010.eur 20.26973 25 -0.18403178
## 23                    DBDS.FINAL202103.eur 20.34515 26 -0.22879436
## 24                 FinnGen.R5_18032020.eur 19.26253 26 -0.29785641
## 25           deCODE.DEPALL_FINAL_WHEAD.eur 17.50199 25 -0.37127287
## 26                         ESTBB.EstBB.eur 17.21966 25 -0.39375541
## 27                        Airwave.0820.eur 16.91729 26 -0.47777808
## 28                      MoBa.harvest24.eur  5.07686 11 -0.96972158
# find range to fit all gcov_int values
gcov_int_min <- min(with(meta_qc_ldsc_pairs_mdd3, gcov_int-gcov_int_se), na.rm=TRUE)
gcov_int_max <- max(with(meta_qc_ldsc_pairs_mdd3, gcov_int+gcov_int_se), na.rm=TRUE)

ggplot(meta_qc_ldsc_pairs_mdd3_all %>% filter(cohort1 %in% meta_qc_ldsc_pairs_mdd3_all_het$cohort1[1:4]),
       aes(x=cohort2, y=gcov_int, ymin=gcov_int-gcov_int_se, ymax=gcov_int+gcov_int_se)) +
geom_point() +
geom_linerange() +
facet_grid(cols=vars(cohort1)) +
coord_flip(ylim=c(gcov_int_min-0.01, gcov_int_max+0.01))
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_segment).
## Warning: Removed 11 rows containing missing values (geom_segment).

ggplot(meta_qc_ldsc_pairs_mdd3_all %>% filter(cohort1 %in% meta_qc_ldsc_pairs_mdd3_all_het$cohort1[5:8]),
   aes(x=cohort2, y=gcov_int, ymin=gcov_int-gcov_int_se, ymax=gcov_int+gcov_int_se)) +
geom_point() +
geom_linerange() +
facet_grid(cols=vars(cohort1)) +
coord_flip(ylim=c(gcov_int_min-0.01, gcov_int_max+0.01))
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_segment).

Clustering

Cluster based on similarity in genetic covariance intercepts.

# get a list of all cohort/subcohort names

subcohorts <- 
bind_rows(
select(meta_qc_ldsc_pairs_mdd3, cohort=cohort1, subcohort=subcohort1),
select(meta_qc_ldsc_pairs_mdd3, cohort=cohort2, subcohort=subcohort2)
) %>%
distinct()

Make a matrix:

cohort_names <- 
subcohorts %>%
transmute(cohort=paste(cohort, subcohort, sep='.')) %>%
pull(cohort)

gcov_int_mat <- diag(length(cohort_names))
dimnames(gcov_int_mat) <- list(cohort_names, cohort_names)

for(i in seq.int(nrow(meta_qc_ldsc_pairs_mdd3))) {
    cohort1 <- meta_qc_ldsc_pairs_mdd3$cohort1[i]
    cohort2 <- meta_qc_ldsc_pairs_mdd3$cohort2[i]
    subcohort1 <- meta_qc_ldsc_pairs_mdd3$subcohort1[i]
    subcohort2 <- meta_qc_ldsc_pairs_mdd3$subcohort2[i]
    cohort_name1 <- paste(cohort1, subcohort1, sep='.')
    cohort_name2 <- paste(cohort2, subcohort2, sep='.')

    # if gcov_int is NA, substitute 0 (average)

    gcov_int <- coalesce(meta_qc_ldsc_pairs_mdd3$gcov_int[i], 0)

    gcov_int_mat[cohort_name1,cohort_name2] <- gcov_int 
    gcov_int_mat[cohort_name2,cohort_name1] <- gcov_int
}

Convert intercepts into a dissimilarity matrix. For intercepts, 1 == same, 0 == average, <0 == different. For dissimilarity, 0 == same and larger values == more dissimilar.

plot(hclust(as.dist(1-gcov_int_mat)))

corrplot(gcov_int_mat, is.corr=FALSE, diag=FALSE)

Genetic correlations

rg_mat <- diag(length(cohort_names))
dimnames(rg_mat) <- list(cohort_names, cohort_names)

for(i in seq.int(nrow(meta_qc_ldsc_pairs_mdd3))) {
    cohort1 <- meta_qc_ldsc_pairs_mdd3$cohort1[i]
    cohort2 <- meta_qc_ldsc_pairs_mdd3$cohort2[i]
    subcohort1 <- meta_qc_ldsc_pairs_mdd3$subcohort1[i]
    subcohort2 <- meta_qc_ldsc_pairs_mdd3$subcohort2[i]
    cohort_name1 <- paste(cohort1, subcohort1, sep='.')
    cohort_name2 <- paste(cohort2, subcohort2, sep='.')

    # if rg is NA, substitute 0 (average)

    rg <- meta_qc_ldsc_pairs_mdd3$rg[i]

    rg_mat[cohort_name1,cohort_name2] <- rg 
    rg_mat[cohort_name2,cohort_name1] <- rg
}
rg_mat_1 <- rg_mat
rg_mat_1[which(rg_mat_1 > 1)] <- 1
rg_mat_1[which(rg_mat_1 < -1)] <- -1

has_rg_idx <- rowSums(!is.na(rg_mat_1)) > 1

corrplot(rg_mat_1[has_rg_idx,has_rg_idx], na.label='.')